Udemy Course Recommender
Fetching Business course data from Udemy using official Udemy API 2.0, cleaning the data, exploring and building content-based course recommendation system
- Setup
- Data Import
- Data Cleaning
- EDA
- Clustering and Recommender system
- Clustering with the OBJECTIVES
- Clustering with the description
- Building the recommender system
Udemy.com is an online learning platform with more than 100,000 courses and over 30 million students all over the world. The platform offers courses in different categories, e.g. Business, Design or Marketing. With all the available options it is very hard to choose the proper course, since everyone has a different taste. A recommender system helps students choose the next course without spending hours reading different course descriptions. It not only saves time for the user, but also helps them find something interesting based on their previous course choices.
!pip install squarify
import pandas as pd
import numpy as np
import scipy.stats as st
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
import requests
import os
import requests
import ast
import pickle
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from operator import itemgetter
from collections import Counter
import matplotlib
import squarify
from sklearn.cluster import KMeans
from scipy.spatial.distance import pdist
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import euclidean_distances
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
import datetime
import scipy.stats as st
import ast
import re
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
# --- Download all Business courses via the Udemy API 2.0 ---
# NOTE(review): the client id is hard-coded while the secret comes from the
# environment; the id presumably belongs in configuration as well.
username='oZAbbHY1iopJmQlRBmUOvepJVAmEadBfLARYO42N'
pw = os.environ['udemy_client_secret']
list_json=[]
url='https://www.udemy.com/api-2.0/courses/?fields[course]=@all&page=1&category=Business'
global_counter = 0  # pages already persisted to disk
local_counter = 0   # pages fetched since the last dump
# Follow the API's 'next' links page by page; every 10 pages the accumulated
# results are pickled to ./courses_<N>.txt so an interrupted run can resume.
while url!=None:
    if not os.path.exists(f"./courses_{global_counter+10}.txt"):
        try:
            local_counter+=1
            # get_data is a project helper defined elsewhere; presumably it
            # performs the authenticated GET and returns the parsed JSON
            # payload -- verify its error behaviour.
            data_json=get_data(url, username, pw)
            url=data_json['next']  # None on the last page -> ends the loop
            list_json.extend(data_json['results'])
            if local_counter%10==0:
                local_counter = 0
                global_counter+=10
                with open(f"./courses_{global_counter}.txt", "wb") as fp:
                    pickle.dump(list_json, fp)
                list_json = []
                print("Stored {} results!".format(global_counter))
        # NOTE(review): a bare except that retries the same url can spin
        # forever on a persistent error, and it hides the actual exception.
        except:
            print(global_counter)
            continue
    else:
        # Chunk already on disk -> skip re-downloading it.
        # NOTE(review): this branch never advances `url`, so once it is taken
        # the while loop repeats it indefinitely; the resume logic needs a way
        # to jump to the page following the stored chunk.
        global_counter+=10
        print("Stored {} results!".format(global_counter))
import glob

# Reassemble the pickled download chunks into a single list of raw records.
list_json = []
for chunk_path in glob.glob("./*.txt"):
    with open(chunk_path, "rb") as chunk_file:
        list_json.extend(pickle.load(chunk_file))
len(list_json)

# Materialise the raw API records as a DataFrame and keep a CSV copy.
df_courses = pd.DataFrame.from_dict(list_json)
df_courses.to_csv('df_courses.csv')
df_courses.head()
df_courses.describe(include=['O'])
# --- Download the reviews (100 per page) for every fetched course ---
local_counter = 0   # pages fetched since the last dump
global_counter = 0  # pages already persisted to disk
for j, id_ in enumerate(df_courses['id'].values):
    url="https://www.udemy.com/api-2.0/courses/{}/reviews/?page=1&page_size=100".format(id_)
    list_json_review=[]
    # Follow the API's 'next' links; every 100 pages the accumulated reviews
    # are pickled to reviews_<N>.txt so an interrupted run can resume.
    while url!=None:
        try:
            # Count the page up front, mirroring the course-download loop.
            # Previously this counter was never incremented, so the modulo
            # test below was always true and a chunk was dumped every page.
            local_counter+=1
            data_json=get_data(url, username, pw)
            url=data_json['next']  # None on the last page -> ends the loop
            list_json_review.extend(data_json['results'])
            if local_counter%100==0:
                local_counter = 0
                global_counter+=100
                with open(f"reviews_{global_counter}.txt", "wb") as fp:
                    pickle.dump(list_json_review, fp)
                list_json_review = []
                print("Stored {} results!".format(global_counter))
        except Exception:
            # Retry the same url; catching Exception rather than a bare
            # except keeps Ctrl-C able to interrupt the crawl.
            continue
    # Append the not-yet-dumped remainder for this course to df_review
    # (pages already pickled above were cleared from list_json_review and are
    # re-read from disk in the assembly step below).
    if j==0:
        df_review= pd.DataFrame.from_dict(list_json_review)
        df_review['id']=id_
    else:
        df_review_unique = pd.DataFrame.from_dict(list_json_review)
        df_review_unique['id']=id_
        df_review = pd.concat([df_review, df_review_unique])
import glob

# Load every stored review chunk and flatten into one list of raw records.
list_json_review = []
for chunk_path in glob.glob("./Udemyreviews/*.txt"):
    with open(chunk_path, "rb") as chunk_file:
        list_json_review.extend(pickle.load(chunk_file))

# Persist the reviews as CSV, plus compressed parquet copies of both datasets.
df_review = pd.DataFrame.from_dict(list_json_review)
df_review.to_csv('df_review.csv')
df_courses.to_parquet('udemy_courses.parquet.gzip', compression='gzip')
df_review.to_parquet('udemy_reviews.parquet.gzip', compression='gzip')
Through the data cleaning process I did the following operations on the raw dataset:
- import the raw data
- transform the relevant columns
- filter the dataset
- keep only the relevant columns
- drop the duplicates
- treat the missing values
- save the cleaned data
# Load the raw course dump and derive the analysis columns.
df_courses = pd.read_parquet("https://github.com/sparsh-ai/reco-data/raw/master/udemy/udemy_courses.parquet.gzip")
df_courses.shape
df_courses.head()
df_courses.columns.tolist()
df = df_courses.copy()
# transform_col / get_float / remove_tags are project helpers defined
# elsewhere; presumably they extract a key from a dict-like column, parse a
# number out of a string, and strip HTML tags respectively -- verify against
# their definitions.
df['primary_category']=transform_col(df['primary_category'], 'title')
df['primary_subcategory']=transform_col(df['primary_subcategory'], 'title')
df['content_info']=df['content_info'].apply(get_float)
df['price']=df['price'].apply(get_float)
# Drop the timezone so the datetime arithmetic below runs on naive stamps.
df['published_time']=pd.to_datetime(df['published_time']).dt.tz_convert(None)
# Approximate course age in months (30-day months).
df['published_since_month']=(datetime.datetime.now()-df['published_time']).apply(lambda x: int(x.days/30))
df['objectives']=transform_col(df['objectives'])
df['description'] = df['description'].fillna('description not available')
df['description_text']=df['description'].apply(remove_tags)
# Expand the JSON-encoded `rating_distribution` strings into absolute
# (rating_orig) and relative (rating_rel) per-course counts for ratings 1..5.
rating_orig = []
rating_rel = []
for rating in df['rating_distribution'].values:
    total = 0
    temp = {}
    temp_rel = {}
    if rating:
        # Each entry is the repr of a list of dicts:
        # "[{'rating': 1, 'count': 3}, ...]" -> parse it safely.
        rating = ast.literal_eval(rating)
        for rating_j in rating:
            j = rating_j['rating']
            count_j = rating_j['count']
            total += count_j
            temp[j] = count_j
        rating_orig.append(temp)
        if total > 0:
            # Fill all five ratings so every row has the same keys; a rating
            # that never occurs gets share 0 instead of NaN (the original
            # skipped missing ratings, producing NaN cells downstream).
            for k in (1, 2, 3, 4, 5):
                temp_rel[k] = round(temp.get(k, 0) * 1.0 / total, 3)
            rating_rel.append(temp_rel)
        else:
            rating_rel.append({1: 0, 2: 0, 3: 0, 4: 0, 5: 0})
    else:
        rating_rel.append({1: 0, 2: 0, 3: 0, 4: 0, 5: 0})
        rating_orig.append({1: 0, 2: 0, 3: 0, 4: 0, 5: 0})
df_rating = pd.DataFrame(rating_rel)
# Order the columns explicitly before the positional rename below; relying on
# the key-insertion order of the first row's dict could mislabel
# rating_1..rating_5 if the API ever returned ratings out of order.
df_rating = df_rating[[1, 2, 3, 4, 5]]
df_rating.columns = ['rating_1', 'rating_2', 'rating_3', 'rating_4', 'rating_5']
df = pd.concat([df, df_rating], axis=1)
df.shape

# Keep only courses that are published and currently live.
live_mask = (df['is_published'] == True) & (df['status_label'] == 'Live')
df = df[live_mask]

# These columns are either already transformed or no longer needed.
df = df.drop(columns=['published_time', 'rating_distribution', 'status_label',
                      'is_published', 'rating', 'description'])
df.shape

# Restrict the frame to the analysis-relevant columns.
cols = ['avg_rating', 'avg_rating_recent', 'description_text', 'has_certificate', 'is_paid',
        'id', 'instructional_level', 'is_enrollable_on_mobile', 'is_owned_by_instructor_team', 'is_practice_test_course',
        'num_article_assets', 'num_curriculum_items', 'num_lectures', 'num_practice_tests', 'num_quizzes',
        'num_subscribers', 'num_reviews', 'objectives', 'price', 'published_title', 'relevancy_score', 'rating_1',
        'rating_2', 'rating_3', 'rating_4', 'rating_5', 'published_since_month', 'primary_category', 'primary_subcategory']
df = df[cols]
df.shape

# One row per course id.
df = df.drop_duplicates(subset='id', keep='first')
df.shape
df.isnull().sum()

# Free courses come through with a missing price -> treat it as 0.
df['price'] = df['price'].fillna(0)

# relevancy_score is not needed downstream.
df = df.drop(columns='relevancy_score')

# Discard any remaining rows with missing values.
df = df.dropna(how='any')

# Some courses carry an empty objectives list -> drop those as well.
has_empty_objectives = df['objectives'].apply(lambda objs: objs == [])
df = df.drop(index=df[has_empty_objectives].index)
df.shape

# Space-separated CSV, matching the separator used when reading it back.
df.to_csv('df_courses.csv', sep=' ')
# Load the raw reviews and reduce them to the columns we can actually use.
df_review_raw = pd.read_parquet("https://github.com/sparsh-ai/reco-data/raw/master/udemy/udemy_reviews.parquet.gzip")
df_review_raw.shape
df_review_raw.head()

# Drop index artefacts ('Unnamed: ...') left over from a CSV round-trip.
keep_mask = ~df_review_raw.columns.str.match('Unnamed')
df_review = df_review_raw.loc[:, keep_mask]

# Pull the display name and title out of the nested user records.
df_review['user_name'] = transform_col(df_review['user'], 'display_name')
df_review['user_title'] = transform_col(df_review['user'], 'title')

# Remove the anonymised placeholder accounts.
placeholder_users = ['Anonymized User', 'Private Udemy For Business User', 'Udemy User']
df_review = df_review[~df_review['user_name'].isin(placeholder_users)]

# Keep the essentials. The user names are not unique, so a user-based
# collaborative recommender cannot be built from this data.
df_review = df_review[['id', 'created', 'rating', 'user_name']]
df_review.drop_duplicates(inplace=True)
df_review.isnull().sum()
# no missing values
df_review.to_csv('df_reviews.csv')
Most important findings on the course dataset:
- there are courses with no reviews/ratings, but most of them are between rating 4 and 4.5
- The price ranges between 0 and 199 EUR
- There are some really popular courses with a lot of subscribers. The top 3 are:
- machinelearning with more than 300,000 subscribers - python-for-data-science-and-machine-learning-bootcamp with 192,000 subscribers - an-entire-mba-in-1-courseaward-winning-business-school-prof with 187,000 subscribers - Most courses don't have any quizzes or practice tests
- The number of lectures mostly vary between 13 and 37 (IQR)
- The average age of a course is 26 months (since it was published). There are more recently published courses than older ones.
- The majority of the courses are for all levels. Only a few courses require an advanced level.
- The courses are divided into 16 subcategories, whereas the two most significant are Finance and Entrepreneurship.
- Two subcategories have an average price higher than 100 dollars : The subcategory Data & analytics with 112, and Project Management with 104 - The total earning on the courses is the highest in the subcategory for Data & Analytics and the second is in Entrepreneurship. - The total number of subscribers are the highest in the category of Entrepreneurship (1.) and in Data & Analytics (2.) - There is not much difference between the average ratings of the courses in each subcategory. The highest average ratings are in the subcategories Media and Communications. - I investigated the top words in each subcategories in the attributes objectives and description separately. E.g. in the subcategory Data& Analytics, the top 5 words are:
- data, use, model, understand, create
After the univariate analysis I also executed multivariate analysis:
- There is a positive correlation between the number of reviews/number of subscribers and the average rating - students normally give good ratings for courses they liked
- As expected, there is a positive correlation between number of subscribers and number of reviews
- There is also a positive correlation between published since and the average rating -> older courses have better ratings. This seems logical, since I would expect that courses which aren't popular won't stay long in the assortment
- The price doesn't have an effect on the average ratings or on the number of subscribers
Most important findings on the reviews dataset:
- The users are unfortunately not unique. Because of this, it is not possible to build a recommender system on the user ratings.
-
- Most users (more than 600,000) gave only one review, but there are a couple of user names with plenty of reviews: the most common user name is David, with more than 400 reviews.
- Most courses have very few reviews
import pandas as pd
import numpy as np
import ast
import scipy.stats as st
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
#for the text attributes
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from string import punctuation
from operator import itemgetter
from nltk.stem import SnowballStemmer
# Reload the cleaned course data; `objectives` was written as a Python list
# repr, so parse it back with ast.literal_eval.
df_courses = pd.read_csv('df_courses.csv', index_col=0, sep=' ', converters={"objectives": ast.literal_eval})
df_courses.head()
df_courses.describe()
There are around 900 courses with no reviews/ratings, but most of the ratings are between rating 4 and 4.5.
# Histogram of the average course rating. The x axis carries the rating
# values and the y axis the course counts -- the original labels were swapped.
plt.hist(df_courses['avg_rating'], bins=50)
plt.xlabel('Average rating')
plt.ylabel('Number of courses')
plt.title('Distribution of average rating')
plt.savefig('avg_rating.png')
The price ranges between 0 and 199 EUR. Most courses cost either 19.99 or 199.99 $.
# Histogram of the course price. The x axis carries the price and the y axis
# the course counts -- the original labels were swapped.
plt.hist(df_courses['price'], bins=50)
plt.xlabel('Price')
plt.ylabel('Number of courses')
plt.title('Distribution of price')
plt.savefig('price.png')
I checked which courses are most visited. Courses with the top 10 most subscribers can be seen below:
# The ten most subscribed courses.
top10_courses = (df_courses
                 .sort_values('num_subscribers', ascending=False)
                 [['published_title', 'num_subscribers']]
                 .head(10))
for _, course in top10_courses.iterrows():
    print('The course {} has {} subscribers.'.format(course['published_title'], course['num_subscribers']))

fig, ax = plt.subplots(figsize=(8, 5))
positions = np.arange(len(top10_courses))
ax.barh(positions, top10_courses['num_subscribers'], alpha=0.6)
plt.yticks(positions, top10_courses['published_title'])
plt.title('Top 10 courses with most subscribers')
ax.set_xlabel('Number of subscribers')
plt.savefig('top10courses.png')
I plotted a histogram and a boxplot from each numerical attribute. Some of the features have outliers, and the distribution is skewed.
# Histogram + boxplot for every numeric attribute, plus its zero count.
var_num = ['avg_rating', 'avg_rating_recent', 'num_article_assets', 'num_curriculum_items',
           'num_lectures', 'num_practice_tests', 'num_quizzes', 'num_subscribers', 'num_reviews', 'price',
           'published_since_month', 'rating_1', 'rating_2', 'rating_3', 'rating_4', 'rating_5']
for col in var_num:
    fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(16, 4))
    hist_ax.hist(df_courses[col], bins=20)
    box_ax.boxplot(df_courses[col])
    hist_ax.set_title('Distribution of ' + str(col))
    box_ax.set_title('Boxplot of ' + str(col))
    zero_count = len(df_courses[df_courses[col] == 0])
    print('Number of 0 values of attribute {} is {}.'.format(col, zero_count))
I defined all data points whose distance from the mean is more than 3*standard deviation as outliers, and checked the distribution without them. I didn't exclude these outliers from the data; I only left them out of the plots to get a better understanding of the distribution of the features.
# Re-plot each numeric attribute with the 3-sigma outliers left out, and
# collect the excluded row indexes for reuse in later plots.
var_num = ['avg_rating', 'avg_rating_recent', 'num_article_assets', 'num_curriculum_items', 'num_lectures',
           'num_practice_tests', 'num_quizzes', 'num_subscribers', 'num_reviews', 'price', 'published_since_month']
excluded_all = []
for col in var_num:
    col_mean = df_courses[col].mean()
    col_std = df_courses[col].std()
    within = df_courses[(df_courses[col] > col_mean - 3 * col_std) &
                        (df_courses[col] < col_mean + 3 * col_std)]
    excluded_all.extend(list(set(df_courses.index) - set(within.index)))
    fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(16, 4))
    hist_ax.hist(within[col], bins=20)
    box_ax.boxplot(within[col])
    hist_ax.set_title('Distribution of ' + str(col))
    box_ax.set_title('Boxplot of ' + str(col))
    print('Number of dropped values of attribute {} is {}.'.format(col, len(df_courses) - len(within)))
excluded = set(excluded_all)
# Correlation heatmap over the numeric attributes.
corr = df_courses[var_num].corr()
fig, ax = plt.subplots(figsize=(10, 7))
tick_names = corr.columns.values
sns.heatmap(corr, xticklabels=tick_names, yticklabels=tick_names)

# Pairplot of a handful of key attributes on the outlier-free subset.
cols = ['avg_rating', 'num_subscribers', 'published_since_month', 'num_reviews', 'price']
temp = df_courses[~df_courses.index.isin(excluded)]
sns.pairplot(temp[cols], plot_kws={'alpha': 0.2})
The followings can be seen from the pairplot above:
- There is a positive correlation between the number of reviews/number of subscribers and the average rating - students normally give better ratings
- As expected, there is a positive correlation between number of subscribers and number of reviews
- There is also a positive correlation between published since and the average rating -> older courses have better ratings. This seems logical, since I would expect that courses which aren't popular won't stay long in the assortment
- The price doesn't have an effect on the average ratings
# Pairplot of the relative rating shares.
sns.pairplot(df_courses[['rating_1', 'rating_2', 'rating_3', 'rating_4', 'rating_5']],
             markers="+", plot_kws={'alpha': 0.2})

# Bar chart of the value counts for every categorical attribute.
var_char = ['has_certificate', 'is_paid', 'instructional_level', 'is_enrollable_on_mobile',
            'is_owned_by_instructor_team', 'is_practice_test_course', 'primary_category',
            'primary_subcategory']
for col in var_char:
    counts = df_courses[col].value_counts()
    plt.figure(figsize=(8, 4))
    ax = counts.plot(kind='bar', alpha=0.4)
    ax.set_title(col)
    ax.set_ylabel('Number of courses')
    ax.set_xticklabels(counts.index)
    # Percentage labels per bar (the annotation loop itself was disabled).
    rects = ax.patches
    shares = counts.values / counts.values.sum() * 100
    labels = [str(round(share, 0)) + '%' for share in shares]
    plt.show()
# Revenue proxy per course, then per-subcategory aggregates.
df_courses['earnings'] = df_courses['price'] * df_courses['num_subscribers']
agg_spec = {'num_subscribers': 'sum',
            'avg_rating': 'mean',
            'price': 'mean',
            'earnings': 'sum'}
df_subcategories = df_courses.groupby('primary_subcategory').agg(agg_spec)
df_subcategories
# One bar chart per aggregated metric on a 2x2 grid.
# The titles must follow the column order produced by the .agg dict
# (num_subscribers, avg_rating, price, earnings); the original list had the
# price and earnings titles swapped.
titles = ['Total number of subscribers', 'Average rating of courses',
          'Average price of courses', 'Total earning on courses']
fig, ax = plt.subplots(2, 2, figsize=(16, 10))
num = 0
for i, col in enumerate(df_subcategories.columns):
    num += 1
    ax = plt.subplot(2, 2, num)
    df_subcategories[col].plot(kind='bar', ax=ax, alpha=0.5)
    plt.title(titles[i])
    if num in range(3):
        # Hide the shared x labels on the top row. matplotlib >= 3 requires a
        # boolean here; the string 'off' was removed.
        plt.tick_params(labelbottom=False)
plt.show()
- Two subcategories have an average price higher than 100 dollars : The subcategory Data & analytics with 112, and Project Management with 104
- The total earning on the courses is the highest in the subcategory for Data & Analytics and the second is in Entrepreneurship.
- The total number of subscribers are the highest in the category of Entrepreneurship and in Data & Analytics
I will analyse the attribute objectives of the courses to get a better understanding about the courses. At first I needed to transform the list of objectives into one string, and then investigate the frequencies of each word. I also implemented stemming: for that, I created a dataframe where the indexes are the stemmed words and the values are the words which were stemmed. I needed it to transform back the stemmed words. By means of the stemming, similar words were counted as the same word (e.g. the words learn and learning are treated as one word).
import nltk
# Tokeniser models and stop-word lists used by the text helpers below.
nltk.download('punkt')
nltk.download('stopwords')
# combine_list / vocab_stem / drop_words / top_words_graph are project
# helpers defined elsewhere; presumably they join the objectives list into a
# string, map stemmed tokens back to original words, de-duplicate that
# mapping, and plot the top words per subcategory -- verify.
objectives_text=df_courses['objectives'].apply(combine_list)
vocab_frame_orig=vocab_stem(objectives_text)
vocab_frame_orig.head()
vocab_frame=drop_words(vocab_frame_orig)
# Tokens to ignore: English stop words, punctuation and scraping artefacts.
StopWords=set(stopwords.words('english')+list(punctuation)+["’", "n't", "'s", "--", "-", "...", "``", "''", "“", "039"])
# Top words per subcategory in the objectives, as bar chart and word cloud.
top_words_graph(df_courses, 'objectives', True, 'bar', StopWords, vocab_frame)
top_words_graph(df_courses, 'objectives', True, 'wordcloud', StopWords, vocab_frame)
# The same analysis on the plain-text course descriptions.
vocab_frame_descr=vocab_stem(df_courses['description_text'])
vocab_frame_descr.head()
vocab_frame_descr=drop_words(vocab_frame_descr)
top_words_graph(df_courses, 'description_text', False, 'bar', StopWords, vocab_frame_descr)
top_words_graph(df_courses, 'description_text', False, 'wordcloud', StopWords, vocab_frame_descr)
# Reviews-per-user distribution.
df_reviews = pd.read_csv('df_reviews.csv', index_col=0)
df_reviews.head()

# How many reviews each user name wrote, then how many users wrote k reviews.
nr_user = df_reviews['user_name'].value_counts()
unique, counts = np.unique(nr_user, return_counts=True)
# most users (more than 600000) have only 1 review

# plt.subplots returns (Figure, Axes); the original unpacked them reversed.
fig, ax = plt.subplots(figsize=(10, 4))
plt.bar(np.arange(len(counts[:20])), counts[:20], align='center')
plt.xticks(np.arange(len(counts[:20])), unique[:20])
plt.xlabel('number of reviews per user')
plt.ylabel('number of users')
plt.title('Number of reviews per user')
plt.show()

# The 20 most active user names.
fig, ax = plt.subplots(figsize=(10, 4))
nr_user[:20].plot(kind='bar', alpha=0.4)
In this section, I cluster the courses and based on the new clusters and other course features, I build a recommender system.
For the clustering I investigated the attributes OBJECTIVES and DESCRIPTION. After the preparation of these two attributes, the first part of the notebook tries to cluster the courses based on the attribute OBJECTIVES, while in the second part I build the clusters by means of the course DESCRIPTIONs. After comparing the results, I used the clustering algorithm based on the description field. The last part of the notebook shows the recommender system, which helps the user to find courses similar to the previously taken ones.
import pandas as pd
import numpy as np
import ast
import scipy.stats as st
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
from nltk.corpus import stopwords
from string import punctuation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
import pickle
# Reload the cleaned datasets produced by the data-cleaning section.
# `objectives` was stored as a Python list repr -> parse with literal_eval.
df_courses=pd.read_csv('df_courses.csv', index_col=0, sep=' ', converters={"objectives": ast.literal_eval})
df_courses.head()
df_reviews=pd.read_csv('df_reviews.csv', index_col=0)
df_reviews.head()
The feature Objectives is a list of course objectives. At first I make a string from the list items by means of the function combine_list.
For the stemming I saved all words with their stemmed correspondence in the dataframe vocab_frame. Since I am interested only in the stemmed words, I dropped all the duplicates from this dataframe (e.g. I treat learn and learning as the same words). This dataframe will be used to transform back the stemmed words.
I defined the StopWords which contains all the expression that shouldn't be considered from the texts.
Finally I applied the TfidfVectorizer on the objectives attribute: This transformator builds feature vectors from text documents so, that it helps to identify words which are frequent in the text but rare in the corpus.
# Build TF-IDF features from the course objectives.
# combine_list joins each objectives list into one string; vocab_stem and
# drop_words build and de-duplicate the stemmed-word -> original-word lookup.
# All three are project helpers defined elsewhere -- verify their contracts.
objectives_text=df_courses['objectives'].apply(combine_list)
vocab_frame_orig=vocab_stem(objectives_text)
#drop duplicates from the dataframe with stemmed words
vocab_frame=drop_words(vocab_frame_orig)
# Tokens to ignore: English stop words, punctuation and scraping artefacts.
StopWords=set(stopwords.words('english')+list(punctuation)+["’", "n't", "'s", "--", "-", "...", "``", "''", "“", "039"])
# Keep the 1000 strongest terms; drop terms present in over 80% of courses.
vectorizer= TfidfVectorizer(stop_words=StopWords, tokenizer=tokenize, max_features=1000, max_df=0.8)
X=vectorizer.fit_transform(objectives_text)
X.shape
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2; modern
# versions need get_feature_names_out() instead.
word_features = vectorizer.get_feature_names()
word_features[50:55]
I executed the same steps as for the attribute Objectives, except the combine_list function: the attribute Description is already a string and not a list.
# Repeat the TF-IDF pipeline on the course descriptions; description_text is
# already a plain string, so no combine_list step is needed.
vocab_frame_descr=vocab_stem(df_courses['description_text'])
vocab_frame_descr=drop_words(vocab_frame_descr)
# Tokens to ignore: English stop words, punctuation and scraping artefacts.
StopWords=set(stopwords.words('english')+list(punctuation)+["’", "n't", "'s", "--", "-", "...", "``", "''", "“", "039"])
vectorizer_descr= TfidfVectorizer(stop_words=StopWords, tokenizer=tokenize, max_features=1000, max_df=0.8)
X_descr=vectorizer_descr.fit_transform(df_courses['description_text'])
X_descr.shape
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2; modern
# versions need get_feature_names_out() instead.
word_features_descr = vectorizer_descr.get_feature_names()
word_features_descr[50:55]
At first I tried to create 15 clusters - there are 16 subcategories, but no need for category 'others'.
# K-Means with 15 clusters (one per subcategory, minus 'others').
# `n_jobs` was deprecated in scikit-learn 0.23 and removed in 1.0, so it is
# no longer passed; it only affected parallelism, never the results.
kmeans = KMeans(n_clusters=15, n_init=10, random_state=1234)
kmeans.fit(X)
# Inspect the clusters via their most characteristic words and compare them
# against the Udemy subcategories (helpers defined elsewhere).
common_words = get_common_words(kmeans, 10)
print_common_words(common_words, word_features, vocab_frame)
squarify_words(common_words, word_features, vocab_frame)
df_courses['cluster'] = kmeans.labels_
heatmap_categories_cluster('cluster', df_courses, 'Reds')
I investigated the relationship between the number of clusters and the inertia (within-cluster sum of squares) to find the optimal number of clusters. According to the elbow method, the line resembles an arm, and the "elbow" of the arm is the value of k that is the best.
# Elbow plot: inertia for k = 1..29 on the objectives TF-IDF matrix.
# get_inertia / plot_inertia are project helpers defined elsewhere.
kRange = range(1,30)
inertia_Kmean = get_inertia(X, kRange)
plot_inertia(kRange, inertia_Kmean)
# Highlight the chosen elbow at k=6 (inertia_Kmean[5] is the k=6 value
# because kRange starts at 1).
plt.plot([6], [inertia_Kmean[5]], 'o--', color='dimgray', linewidth=3)
# Hand-drawn guide line approximating the linear trend around the elbow.
plt.plot([1,6,11], [8520, 8170,7820], '--', color='k', linewidth=1)
plt.annotate("Let's try k=6", xy=(6, inertia_Kmean[5]), xytext=(6,7700),
             size=14, weight='bold', color='dimgray',
             arrowprops=dict(facecolor='dimgray', shrink=0.05))
It is hard to tell what is the optimal number of clusters from the graph. I tried several number of clusters and finally created 6 clusters with k-Means algorithm
# Final objectives-based model: K-Means with k=6.
# `n_jobs` was deprecated in scikit-learn 0.23 and removed in 1.0, so it is
# no longer passed; it only affected parallelism, never the results.
kmeans = KMeans(n_clusters=6, n_init=10, random_state=1234)
kmeans.fit(X)
# Characteristic words per cluster and comparison with the subcategories
# (helpers defined elsewhere).
common_words = get_common_words(kmeans, 10)
print_common_words(common_words, word_features, vocab_frame)
squarify_words(common_words, word_features, vocab_frame)
df_courses['cluster'] = kmeans.labels_
heatmap_categories_cluster('cluster', df_courses, 'Reds')
plot_common_words(kmeans, 5, word_features, vocab_frame, df_courses, 'cluster')
print_titles_cluster(5, df_courses, 'cluster')
In this section I used hierarchical clustering. This method assumes that at the beginning each item has its own cluster. The algorithm then merges the individual clusters one by one. I created a dendrogram, which shows the distances between the clusters. I plotted the last 16 merges of the hierarchical clustering algorithm.
# Agglomerative (hierarchical) clustering on the objectives TF-IDF matrix.
# get_linkage / plot_dendrogram are project helpers defined elsewhere.
z=get_linkage(X )
# Show only the last 16 merges and draw the cut line at distance 7.8.
plot_dendrogram(z, 16, line_dist=7.8)
#let's cut the dendrogram at 7.8
df_courses['cluster_hier']=fcluster(Z=z, t=7.8, criterion='distance')
df_courses['cluster_hier'].value_counts()
heatmap_categories_cluster('cluster_hier', df_courses, 'Reds' )
The distribution of the clusters through hierarchical clustering is very unproportional.
I will do a simple PCA analysis and keep the first 2 principal components in order to plot the courses in 2D. I will use the results of the kmeans clustering (with 6 groups), since the hierarchical clustering resulted in an overproportional group.
# 2-D PCA projection of the objectives TF-IDF matrix, coloured by the k=6
# k-means clusters; plot_with_pca is a project helper (the 500 presumably
# caps the number of plotted courses -- verify against its definition).
plot_with_pca (X, df_courses['cluster'], 500)
In the 2-D plot, almost all the clusters are well separated from each other. Cluster0 and cluster5 overlap each other - in cluster 5 the majority of the courses can be found.
After building clusters with the objectives attribute, I investigated the course descriptions as the basis of the clustering algorithmns. I executed the same analyses and got better distributed clusters by means of the description feature.
At first I tried to create 15 clusters, similar to the previous clustering with the attribute objectives. There are clusters with only a few courses, so I tried to optimize the number of clusters to build (k).
# K-Means with 15 clusters on the description TF-IDF matrix.
# `n_jobs` was deprecated in scikit-learn 0.23 and removed in 1.0, so it is
# no longer passed; it only affected parallelism, never the results.
kmeans_descr = KMeans(n_clusters=15, n_init=10, random_state=1234)
kmeans_descr.fit(X_descr)
# Characteristic words per cluster and comparison with the subcategories
# (helpers defined elsewhere).
common_words = get_common_words(kmeans_descr, 10)
print_common_words(common_words, word_features_descr, vocab_frame_descr)
squarify_words(common_words, word_features_descr, vocab_frame_descr)
df_courses['cluster_descr'] = kmeans_descr.labels_
heatmap_categories_cluster('cluster_descr', df_courses, 'Reds')
df_courses['cluster_descr'].value_counts()
# Elbow plot: inertia for k = 1..29 on the description TF-IDF matrix.
kRange = range(1,30)
inertia_Kmean = get_inertia(X_descr, kRange)
plot_inertia(kRange, inertia_Kmean)
# Highlight the chosen elbow at k=8 (inertia_Kmean[7] is the k=8 value
# because kRange starts at 1).
plt.plot([8], [inertia_Kmean[7]], 'o--', color='dimgray', linewidth=3)
# Hand-drawn guide line approximating the linear trend around the elbow.
plt.plot([1,8,15], [8050, 7580,7110], '--', color='k', linewidth=1)
plt.annotate("Let's try k=8", xy=(8, inertia_Kmean[7]), xytext=(9,7800),
             size=14, weight='bold', color='dimgray',
             arrowprops=dict(facecolor='dimgray', shrink=0.05))
# Final description-based model: K-Means with k=8.
# `n_jobs` was deprecated in scikit-learn 0.23 and removed in 1.0, so it is
# no longer passed; it only affected parallelism, never the results.
# NOTE: the random_state (123456) intentionally differs from the earlier
# runs (1234); keep it, since the saved model below was fitted with it.
kmeans_descr = KMeans(n_clusters=8, n_init=10, random_state=123456)
kmeans_descr.fit(X_descr)
# Characteristic words per cluster and comparison with the subcategories
# (helpers defined elsewhere).
common_words = get_common_words(kmeans_descr, 10)
print_common_words(common_words, word_features_descr, vocab_frame_descr)
squarify_words(common_words, word_features_descr, vocab_frame_descr)
df_courses['cluster_descr'] = kmeans_descr.labels_
heatmap_categories_cluster('cluster_descr', df_courses, 'Reds')
plot_common_words(kmeans_descr, 5, word_features_descr, vocab_frame_descr, df_courses, 'cluster_descr')
print_titles_cluster(3, df_courses, 'cluster_descr')
# Agglomerative (hierarchical) clustering on the description TF-IDF matrix.
z_descr = get_linkage(X_descr)
# Plot the description dendrogram -- the original mistakenly passed the
# objectives linkage `z` here -- and draw the cut line at 9.6, matching the
# distance actually used by fcluster below (the original drew it at 7.8).
plot_dendrogram(z_descr, 16, line_dist=9.6)
df_courses['cluster_hier_descr'] = fcluster(Z=z_descr, t=9.6, criterion='distance')
df_courses['cluster_hier_descr'].value_counts()
# Inspect the description-based hierarchical clusters; the original re-plotted
# the objectives-based 'cluster_hier' column here.
heatmap_categories_cluster('cluster_hier_descr', df_courses, 'Reds')
The distribution of the clusters through hierarchical clustering is very unproportional with the attribute description as well. In the further analysis I will use results of the k-means clustering with k=8.
# 2-D PCA projection of the description TF-IDF matrix, coloured by the k=8
# clusters; 1000 presumably caps the number of plotted courses -- verify
# against plot_with_pca's definition.
plot_with_pca(X_descr, df_courses['cluster_descr'], 1000)
# Persist the final k=8 description model, then reload it and sanity-check
# that the stored labels are reproduced.
filename = 'kmeans8.sav'
# Context managers close the file handles deterministically; the original
# passed bare open() results to pickle, leaking the handles.
with open(filename, 'wb') as model_file:
    pickle.dump(kmeans_descr, model_file)
with open('kmeans8.sav', 'rb') as model_file:
    model_kmeans = pickle.load(model_file)
model_kmeans
# Every course should land in the same cluster it was fitted into.
values = model_kmeans.predict(X_descr)
(values == df_courses['cluster_descr']).sum()
There are clusters which are close to each other, e.g. cluster 3 is between clusters 0 and 8. Clusters 7, 4 and 1 are also adjacent. It is important to remember that I kept only 2 principal components, which explain 4% of the total variance (which is plotted on the graph). In contrast, the clusters are not reduced; they contain all the information.
For the recommender system I use the course features together with the result of the k-means clustering with k=8. I transformed the course dataset into a feature matrix by keeping only the relevant features (e.g. no need for the course id). For the categorical variables I introduced dummy variables. The clusters were also transformed into dummy variables, since the order of the clusters doesn't have any meaning (cluster 0 is not better or worse than cluster 1). As the last step of the preparation I normalized the feature matrix, since the features have different scales. I used the cosine similarity to compare the courses with each other.
There are 2 functions, which can be used to recommend courses:
- Function recommend_for_user recommends courses for the user based on his/her previous courses. This function takes the user as input.
- Function recommend_courses recommends courses based on another course_id. This function takes the course_id as input and looks for the courses that are similar to the original course.
# Build the course feature matrix for the recommender: keep the relevant
# columns, dummy-encode the categoricals (including the k-means cluster,
# whose numeric order carries no meaning), and normalise the scales.
rel_cols = ['avg_rating', 'has_certificate', 'instructional_level', 'num_lectures', 'num_quizzes',
            'num_practice_tests', 'is_practice_test_course', 'num_article_assets', 'num_curriculum_items',
            'num_subscribers', 'num_reviews', 'price', 'primary_subcategory', 'cluster_descr']
# .copy() so the assignments below mutate an independent frame instead of a
# view of df_courses (SettingWithCopyWarning, previously hidden by the
# global warnings filter).
df_rel = df_courses[rel_cols].copy()
df_rel['has_certificate'] = df_rel['has_certificate'].astype(int)
df_rel['cluster_descr'] = df_rel['cluster_descr'].astype(str)
dummies = pd.get_dummies(df_rel[['primary_subcategory', 'instructional_level', 'cluster_descr']],
                         prefix=['subcat', 'level', 'cluster'])
df_rel.drop(columns=['primary_subcategory', 'instructional_level', 'cluster_descr'], inplace=True)
df_rel = pd.concat([df_rel, dummies], axis=1)
df_rel.head()
# normalize_features is a project helper defined elsewhere; presumably it
# rescales every column to a comparable range -- verify before reuse.
df_norm = normalize_features(df_rel)
# Pick sample users for the recommender demo.
nr_user=df_reviews['user_name'].value_counts()
unique, counts = np.unique(nr_user, return_counts=True)
#dict(zip(unique, counts))
#recommend_for_user(user_name)
# Users with the fewest reviews, as candidates to try the recommender on.
nr_user.sort_values()[:10]
# recommend_for_user is a project helper defined elsewhere; presumably it
# looks up the user's rated courses and returns the 5 most similar courses
# from the normalised feature matrix -- verify against its definition.
recommend_for_user('DEEPAK IYER', 5, df_reviews, df_courses, df_norm)
recommend_for_user('Henk Bergsma', 5,df_reviews, df_courses, df_norm)